{
 "nbformat": 4,
 "nbformat_minor": 2,
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.6.3"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": "/Users/jiangzl/.virtualenvs/python3.6/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n  from ._conv import register_converters as _register_converters\nUsing TensorFlow backend.\n"
    }
   ],
   "source": [
    "import sys\n",
    "# 加载自定义包(添加：中间件)\n",
    "sys.path.append(\"src/py3.x/tensorflow2.x\")\n",
    "from text_Emotion import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "outfile = \"/opt/data/开源词向量/gensim_word2vec_60/Word60.model\"\n",
    "# 加载词向量\n",
    "Word2VecModel = loadMyWord2Vec(outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "空间的词向量（60 维）: (60,) [ 2.2506642  -1.7324443   0.35593075 -3.7236977  -0.6317619   2.1253817\n -0.8911206   0.61192095 -2.5709946   5.6513844   2.3008282  -4.102604\n -0.61898416 -1.1190889  -6.060641    2.3529105   1.8131357   2.0764832\n -2.102738   -0.414962   -2.0553887   0.37966883 -2.015982   -1.4542716\n  3.191199    0.3265181   0.7307454   1.4761372  -2.2383723   0.925493\n  6.2617674  -1.3852879   0.6405419  -0.5601632  -1.084447    5.689829\n  0.46593904 -2.824275    4.2015862  -0.87934065  1.518804   -1.493514\n -1.9851282  -0.63166183  0.96814466  1.6375747   1.1566993   1.1981301\n  0.7950756  -3.0055897   1.2649575   1.2099069   1.9403213   1.3719954\n  2.6494706   1.8465079  -0.5507954  -2.3987298  -1.8990258  -4.651662  ]\n打印与空间最相近的5个词语： [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
    }
   ],
   "source": [
    "embeddings_matrix = load_embeding()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "--:  [[ 0.          0.          0.         ...  0.          0.\n   0.        ]\n [ 3.6153059   2.63272738 -0.98327219 ...  0.03685202 -0.78566265\n   1.06350613]\n [ 0.21444647  2.58100891  0.08306306 ... -0.43973923 -0.2102039\n  -1.37015963]\n ...\n [-1.07420349  1.90465117  2.2614491  ... -1.90614116 -0.34697708\n  -2.43622112]\n [ 1.53204441  0.60434735 -0.02905927 ... -0.04591536 -0.63762575\n   0.29778937]\n [ 0.20260553  0.03990031 -0.22745971 ... -0.17701624  0.16334218\n   0.06799572]]\n"
    }
   ],
   "source": [
    "print('--: ', embeddings_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import os\n",
    "import keras\n",
    "import random\n",
    "import gensim\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras import Model\n",
    "from keras.models import load_model\n",
    "from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input\n",
    "from keras.optimizers import Adam\n",
    "# 该目录下的 config.py文件， 数据文件是: poetry.txt\n",
    "from config import Config\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": "空间的词向量（60 维）: (60,) [ 2.2506642  -1.7324443   0.35593075 -3.7236977  -0.6317619   2.1253817\n -0.8911206   0.61192095 -2.5709946   5.6513844   2.3008282  -4.102604\n -0.61898416 -1.1190889  -6.060641    2.3529105   1.8131357   2.0764832\n -2.102738   -0.414962   -2.0553887   0.37966883 -2.015982   -1.4542716\n  3.191199    0.3265181   0.7307454   1.4761372  -2.2383723   0.925493\n  6.2617674  -1.3852879   0.6405419  -0.5601632  -1.084447    5.689829\n  0.46593904 -2.824275    4.2015862  -0.87934065  1.518804   -1.493514\n -1.9851282  -0.63166183  0.96814466  1.6375747   1.1566993   1.1981301\n  0.7950756  -3.0055897   1.2649575   1.2099069   1.9403213   1.3719954\n  2.6494706   1.8465079  -0.5507954  -2.3987298  -1.8990258  -4.651662  ]\n打印与空间最相近的5个词语： [('物件', 0.7354965806007385), ('维度', 0.7326242923736572), ('自由空间', 0.7247114181518555), ('拓扑', 0.7112817764282227), ('三维空间', 0.7062257528305054)]\n加载词向量结束..\n"
    },
    {
     "ename": "NameError",
     "evalue": "name 'load_data' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-18-afd80ed77829>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mEmotionModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mConfig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, config)\u001b[0m\n\u001b[1;32m     75\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     76\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     79\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/git/AiLearning/src/py3.x/tensorflow2.x/text_Emotion.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    123\u001b[0m         \u001b[0;34m'''训练模型'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    124\u001b[0m         \u001b[0membeddings_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_embeding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 125\u001b[0;31m         \u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    126\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeddings_matrix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    127\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'load_data' is not defined"
     ]
    }
   ],
   "source": [
    "model = EmotionModel(Config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>label</th>\n      <th>comment</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>0</td>\n      <td>1</td>\n      <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n    </tr>\n    <tr>\n      <td>1</td>\n      <td>1</td>\n      <td>商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!</td>\n    </tr>\n    <tr>\n      <td>2</td>\n      <td>1</td>\n      <td>早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n    </tr>\n    <tr>\n      <td>3</td>\n      <td>1</td>\n      <td>宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...</td>\n    </tr>\n    <tr>\n      <td>4</td>\n      <td>1</td>\n      <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n    </tr>\n    <tr>\n      <td>5</td>\n      <td>1</td>\n      <td>总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象</td>\n    </tr>\n    <tr>\n      <td>6</td>\n      <td>1</td>\n      <td>价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...</td>\n    </tr>\n    <tr>\n      <td>7</td>\n      <td>1</td>\n      <td>不错，在同等档次酒店中应该是值得推荐的！</td>\n    </tr>\n    <tr>\n      <td>8</td>\n      <td>1</td>\n      <td>入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味，房间内较新。房间大小合适，卫生间设备齐...</td>\n    </tr>\n    <tr>\n      <td>9</td>\n      <td>1</td>\n      <td>1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。2。早餐还可以，只是品种不是很多。3。...</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "   label                                            comment\n0      1  距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n1      1                       商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!\n2      1         早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n3      1  宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...\n4      1               CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n5      1           总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象\n6      1  价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...\n7      1                               不错，在同等档次酒店中应该是值得推荐的！\n8      1  入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味，房间内较新。房间大小合适，卫生间设备齐...\n9      1  1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。2。早餐还可以，只是品种不是很多。3。..."
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel(\"src/py3.x/tensorflow2.x/EmotionData.xlsx\", header=0, error_bad_lines=False, encoding=\"utf_8_sig\")\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = df[\"label\"].tolist()\n",
    "y[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def func(line, ngrams=[]):\n",
    "    # 加入我们的组合词，保证分词的准确性\n",
    "        \n",
    "    if ngrams != []:\n",
    "        for word in ngrams:\n",
    "            jieba.add_word(\"\".join(word.lower()))\n",
    "    # # 将文本 ['1, 2, 3', '1, 2, .., n'] 分解为: [[1, 2, 3], [1, 2, .., n]]\n",
    "    words = [word for word in jieba.cut(str(line).lower(), cut_all=False)]\n",
    "    # print(\">>> \", train)\n",
    "    return \" \".join(words)\n",
    "x = df[\"comment\"].apply(lambda line: func(line))\n"
   ]
  }
 ]
}